## Mikael Poul Johannesson
## Febraury 2016

## Funtions for estimatig and plotting multiple stm_models. Mostly adapted for relatively
## small n of topics (max, say, 15/20).

pkgs <- c("stm", "dplyr", "ggplot2", "tidyr", "gridExtra", "RGraphics", "ineq", "qdap", "GGally", "rmarkdown", "tokenizers")
for (pkg in pkgs) if (!require(pkg, character.only = TRUE)) install.packages(pkg)

## load("data/ncp-prepared-data.RData")
## load("output/corpus.RData")
## load("output/stm_models.RData")

ci <- function(x) return(qnorm(0.975) * (sd(na.omit(x)) / sqrt(length(na.omit(x)))))

##
## Print functions  ----------------------------------------------------------------------------------
##

texts_to_document <- function(texts, meta = NULL, filename, output = "word", title, add = c("age", "gender", "education", "interest")) {

  add_meta <- function(texts, metadata, add) {
    if (!is.null(metadata)) {
      header <- sapply(add, function(x) paste0("[", x, ": ", metadata[, x], "]"))
      header <- apply(header, 1, paste0, collapse = "")
      header <- paste0(header, "\n")
      out <- paste0("* ", texts, "^", header, "^\n")
    } else {
      out <- paste0("* ", texts, "\n")
    }
    return(out)
  }

  if (output == "pdf") {
    output_type <- "pdf_document\nheader-includes:\n    - \\usepackage[utf8]{inputenc}"
  } else if (output == "html") {
    output_type <- "html_document"
  } else {
    output <- "docx"
    output_type <- "word_document"
  }

  banner <- paste0("---\n",
                   'title: "', title, '"\n',
                   "output: ", output_type, "\n",
                   "---\n\n")
  to_print <- enc2utf8(add_meta(texts, meta, add))
  all <- c(banner, to_print)

  writeLines(all, "texts.Rmd")
  rmarkdown::render("texts.Rmd", output_file = paste0(filename, ".", output), quiet = TRUE)
  invisible(file.remove("texts.Rmd"))
}

## texts_to_document(out$meta$text, out$meta, "climdo-answers", "pdf", "CLIMDO Project: All open-ended answers")

##
## Plotting Functions  ----------------------------------------------------------------------------------
##




##
## Plot Frex
##

plot_frex <- function(fit, n = 15) {
    frex <- data.frame("x" = 0, "topic" = 1:ncol(fit$theta),
                       "frex" = paste0(apply(labelTopics(fit, n = n)$frex, 1, paste, collapse = ", ")))
    frex$topic <- factor(frex$topic, levels = rev(1:ncol(fit$theta)))
    p <- ggplot(frex, aes(x = x, y = topic, label = frex))
    p <- p + geom_text(hjust = 0, size = 2.5) + theme_bw()
    p <- p + scale_x_continuous(limits = c(0, 1), expand = c(.01, .01))
    p <- p + labs(x = "Frex", y = "Topic")
    p <- p + theme(panel.background = element_blank(),
              panel.grid.major.x = element_blank(),
              panel.grid.minor.x = element_blank(),
              panel.grid.minor.y = element_blank(),
              axis.text.x = element_blank(),
              axis.text.y = element_text(size = 9),
              axis.line = element_blank(),
              axis.ticks.x = element_blank(),
              strip.background = element_rect(colour = "grey", fill = "grey", size = 3))
    return(p)
}

##
## Plot sentences
##

plot_texts <- function(fit, texts, split_width = 62, n_sentences = 25) {
    cut <- function(texts, cutoff = 190) {
        cuts <- nchar(texts) >= cutoff
        adds <- rep("", length(texts))
        adds[cuts] <- "(...)"
        return(paste0(substr(texts, 1, cutoff), adds))
    }
    thoughts <- lapply(1:ncol(fit$theta), function(x) {
        out <- strwrap(cut(paste0(1:n_sentences, ": ", findThoughts(fit, texts = texts, n = n_sentences, topics = x)$docs[[1]])), width = split_width, exdent = 5)
        out <- data.frame("topic" = x,
                          "topiclabel" = paste0("Topic ", x, ": ", paste0(labelTopics(fit, n = 3)$frex[x, ], collapse = ", ")),
                          "texts" = paste0(out, collapse = "\n"))
        return(out)
    })
    thoughts <- do.call("rbind", thoughts)
    p <- ggplot(thoughts, aes(x = 0, y = 1, label = texts))
    p <- p + geom_text(hjust = 0, vjust = 1, size = 2.5) ##+ theme_void()
    p <- p + facet_wrap(~ topiclabel, ncol = 3)
    p <- p + scale_x_continuous(limits = c(0, 1), expand = c(.01, .01))
    p <- p + scale_y_continuous(limits = c(0, 1), expand = c(.01, .01))
    p <- p + theme(panel.background = element_blank(),
              panel.grid.major.x = element_blank(),
              panel.grid.minor.x = element_blank(),
              panel.grid.minor.y = element_blank(),
              axis.text = element_blank(),
              axis.line = element_blank(),
              axis.ticks = element_blank(),
              axis.title = element_blank(),
              strip.background = element_rect(colour = "grey", fill = "grey", size = 3))
    return(p)
}

##
## Topic proportions
##

plot_prop <- function(fit) {
    prop <- as.data.frame(fit$theta)
    ##names(prop) <- paste0("Topic ", 1:ncol(fit$theta))
    names(prop) <- 1:ncol(fit$theta)
    prop <- tidyr::gather(prop, "topic", "prop")
    prop$topic <- as.factor(prop$topic)
    prop$topic <- factor(prop$topic, levels = rev(1:ncol(fit$theta)))
    prop <- prop %>%
        group_by(topic) %>%
        summarize(mean = mean(prop),
                  ci = ci(prop),
                  upper = mean + ci,
                  lower = mean - ci)
    p <- ggplot(prop, aes(x = mean, xmin = lower, xmax = upper, y = topic))
    p <- p + geom_point() + guides(colour = FALSE)
    p <- p + geom_segment(aes(yend = topic, x = 0, xend = mean))
    p <- p + scale_x_continuous(limits = c(0, .4))
    p <- p + theme_bw()
    p <- p + labs(y = "Topic", x = "Expected Topic Proportion\n")
    p <- p + theme(axis.title = element_text(size = 9),
                   axis.text =  element_text(size = 9))
    return(p)
}

##
## Plot Gini
##

## Gini theta
plot_gini_theta <- function(fit) {
    gini <- data.frame("gini" = apply(fit$theta, 2, ineq, type = "Gini"),
                       "topic" = as.factor(1:ncol(fit$theta)))
    gini$topic <- factor(gini$topic, levels = rev(1:ncol(fit$theta)))
    p <- ggplot(gini, aes(y = topic, x = gini))
    p <- p + geom_point() + theme_bw()
    p <- p + labs(x = "Gini (inequality) of documents\nwithin topics", y = "Topic")
    p <- p + scale_x_continuous(limits = c(0, .75))
    p <- p + geom_segment(aes(yend = topic, x = 0, xend = gini))
    p <- p + theme(axis.title = element_text(size = 9),
                   axis.text =  element_text(size = 9))
    return(p)
}


## Gini pers
plot_gini_pers <- function(fit) {
    p <- ggplot(data.frame("gini" = apply(fit$theta, 1, ineq, type = "Gini")), aes(x = gini))
    p <- p + geom_density(fill = "grey", alpha = .5) + theme_bw()
    p <- p + geom_vline(aes(xintercept = mean(gini)), linetype = "dashed", colour = "orange")
    p <- p + labs(x = "Inequality of topics\nproportions (Gini)", y = "Density")
    p <- p + theme(axis.title = element_text(size = 9),
                   axis.text =  element_text(size = 9))
    return(p)
}
## plot_gini_pers(fit)
## p_gini_pers <- plot_gini_pers(fit)

##
## Plot Topic Correlations
##

plot_corr <- function(fit) {
    thetas <- as.data.frame(fit$theta)
    names(thetas) <- paste0("T", 1:ncol(fit$theta))
    p <- ggcorr(thetas, name = "Correlation", low = "#3B9AB2", high = "orange", label_alpha = TRUE, label = TRUE, label_size = 2, size = 2)
    p <- p + theme_bw() + theme(legend.position = "none") + coord_cartesian()
    p <- p + labs(x = "\nCorrelation between topics\n(pairwise pearson)", y = "\n")
    p <- p + theme(axis.title = element_text(size = 9),
                   axis.text =  element_text(size = 9))
    return(p)
}
## plot_corr(fit)
## p_corr <- plot_corr(fit)

##
## Plot Overview
##

plot_overview <- function(view, fit, n_run) {
    n_topic <- ncol(fit$theta)
    n_topics_total <- view$topics
    n_runs_kept <- view$runs[1]
    overview <- expand.grid(n_topics_total, 1:n_runs_kept)
    names(overview) <- c("topics", "run")
    overview$current <- "grey80"
    overview$current[overview$topics == n_topic] <- "grey40"
    overview$current[overview$run == n_run] <- "grey40"
    overview[overview$topics == n_topic & overview$run == n_run, "current"] <- "orange"
    for (var in names(overview)) overview[, var] <- as.factor(overview[, var])
    overview$run <- factor(overview$run, levels = rev(levels(overview$run)))
    p <- ggplot(overview, aes(x = topics, y = run, fill = current, colour = current))
    p <- p + geom_tile(colour = "grey", fill = overview$current)
    p <- p + theme_minimal() + guides(fill = FALSE) + labs(x = "N Topics", y = "Run")
    p <- p + theme(axis.title = element_text(size = 9),
                   axis.text =  element_text(size = 9))
    return(p)
}


plot_title <- function(fit, n_run) {
    n_topic <- ncol(fit$theta)
    title_text <- paste0("N Topics: ", n_topic, "\n",
                         "      Run: ", n_run)
    p <- ggplot(data.frame("x" = 0, "y" = 0, "title_text" = title_text), aes(x = x, y = y, label = title_text))
    p <- p + geom_text(size = 6) + theme_void()
    return(p)
}
## plot_title(fit, 1)
## p_title <- plot_title(fit, 1)


##
## Semantic coherence ~ exclusivity
##

plot_semcoh <- function(stm_models, mod, n_run = NULL, global_scale = TRUE) {
    highlight_run <- n_run
    semcoh_exc <- data.frame("run" = as.numeric(sapply(1:length(stm_models[[mod]]$exclusivity), rep, times = length(stm_models[[mod]]$exclusivity[[1]]))),
                             "semcoh" = unlist(stm_models[[mod]]$semcoh), "exc" = unlist(stm_models[[mod]]$exclusivity))
    semcoh_exc$run <- as.factor(semcoh_exc$run)
    if (!is.null(highlight_run)) {
        semcoh_exc$highlight <- "grey"
        semcoh_exc$highlight[semcoh_exc$run == highlight_run] <- "orange"
    } else {
        semcoh_exc$highlight <- semcoh_exc$run
    }
    p <- ggplot(semcoh_exc, aes(x = semcoh, y = exc, colour = highlight, label = run))
    if (!is.null(highlight_run)) {
        p <- p + geom_text(colour = semcoh_exc$highlight)
    } else {
        p <- p + geom_text()
    }
    p <- p + theme_bw() + guides(colour = FALSE, label = FALSE)
    if (global_scale) {
        p <- p + scale_x_continuous(limits = c(min(sapply(1:length(stm_models), function(x) min(unlist(stm_models[[x]]$semcoh)))),
                                        max(sapply(1:length(stm_models), function(x) max(unlist(stm_models[[x]]$semcoh))))))
        p <- p + scale_y_continuous(limits = c(min(sapply(1:length(stm_models), function(x) min(unlist(stm_models[[x]]$exclusivity)))),
                                        max(sapply(1:length(stm_models), function(x) max(unlist(stm_models[[x]]$exclusivity))))))
    }
    p <- p + labs(x = "Semantic Coherence\n(by run)", y = "Exclusivity")
    p <- p + theme(axis.title = element_text(size = 9),
                   axis.text =  element_text(size = 9))
    return(p)
}

##
## TITLE PAGES
##

## Frontpage title
plot_fp_title <- function(title = "Overview of STM RUNS") {
    p <- ggplot(data.frame(title = title, x = 0, y = 0), aes(x = x, y = y, label = title))
    p <- p + geom_text(size = 7, fontface = "italic") + theme_void()
    ##p <- p + scale_x_continuous(limits = c(0, 1))
    return(p)
}
## p_fp_title <- plot_fp_title()
## p_fp_title

## Frontpage summary (upper left)
plot_fp_summary <- function(project, version, author) {
    p <- ggplot(data.frame(text = paste0("Author/contact:  ", author, "\n",
                                         "Project: ", project, "\n",
                                         "Version: ", version, "\n",
                                         "Printed: ", Sys.Date(), "\n\n"),
                           x = 0, y = 0),
                aes(x = x, y = y, label = text))
    p <- p + geom_text(size = 4, hjust = 0) + theme_void()
    p <- p + scale_x_continuous(limits = c(0, 1))
    return(p)
}
## plot_fp_summary(project = "CLIMDO", version = "V3")
## p_fp_summary <- plot_fp_summary(project = "CLIMDO", version = "V3")

plot_fp_desc <- function(description) {
    p <- ggplot(data.frame(desc = paste0(strwrap(description, width = 70), collapse = "\n"), x = 0, y = 0), aes(x = x, y = y, label = desc))
    p <- p + geom_text(size = 5, fontface = "italic") + theme_void()
    p <- p + scale_y_continuous(limits = c(-1, .5))
    return(p)
}
## p_fp_desc <- plot_fp_desc(description)
## p_fp_desc

##
## SUMMARY
##

## Most frequent words (one for total, one for what is left in the corpus)
plot_freq <- function(processed, out, texts, ngrams = TRUE, n_words = 50) {
    words_proc <- data.frame("words" = processed$vocab[order(out$wordcounts, decreasing = TRUE)][1:n_words],
                             "count" = out$wordcounts[order(out$wordcounts, decreasing = TRUE)][1:n_words],
                             "order" = rev(1:n_words),
                             "type" = "Used in Estimation: Words")
    words_raw_table <- table(strsplit(paste0(tolower(gsub("\\s+", " ", gsub("^\\s+|\\s+$", "", texts))), collapse = " "), " "))
    words_raw <- data.frame("words" = as.character(unlist(attr(words_raw_table, "dimnames")))[order(as.numeric(words_raw_table), decreasing = TRUE)][1:n_words],
                            "count" = as.numeric(words_raw_table)[order(as.numeric(words_raw_table), decreasing = TRUE)][1:n_words],
                            "order" = rev(1:n_words),
                            "type" = "Raw: Words")
    words <- rbind(words_proc, words_raw)
    if (ngrams) {
        ng_2 <- ngrams(tolower(gsub("\\s+", " ", gsub("^\\s+|\\s+$", "", texts))), n = 2)
        ngrams_2_raw <- table(as.character(unlist(lapply(ng_2$all_n$n_2, paste, collapse = " "))))
        ngrams_2 <- data.frame("words" = as.character(unlist(attr(ngrams_2_raw, "dimnames")))[order(as.numeric(ngrams_2_raw), decreasing = TRUE)][1:n_words],
                               "count" = as.numeric(ngrams_2_raw)[order(as.numeric(ngrams_2_raw), decreasing = TRUE)][1:n_words],
                               "order" = rev(1:n_words),
                               "type" = "Raw: Bigrams")
        ng_3 <- ngrams(tolower(gsub("\\s+", " ", gsub("^\\s+|\\s+$", "", texts))), n = 3)
        ngrams_3_raw <- table(as.character(unlist(lapply(ng_3$all_n$n_3, paste, collapse = " "))))
        ngrams_3 <- data.frame("words" = as.character(unlist(attr(ngrams_3_raw, "dimnames")))[order(as.numeric(ngrams_3_raw), decreasing = TRUE)][1:n_words],
                               "count" = as.numeric(ngrams_3_raw)[order(as.numeric(ngrams_3_raw), decreasing = TRUE)][1:n_words],
                               "order" = rev(1:n_words),
                               "type" = "Raw: Trigrams")
        words <- rbind(words, ngrams_2, ngrams_3)
        ##    words$type <- factor(words$type, levels = rev(levels(words$type)))
    }
    p <- ggplot(words, aes(x = "ngram", y = order, label = words))
    p <- p + facet_wrap(~ type, nrow = 1)
    p <- p + geom_text(aes(size = order, colour = count)) + geom_text(aes(x = "count", label = count))
    p <- p + guides(size = FALSE, alpha = FALSE, colour = FALSE)
    p <- p + scale_colour_gradient(low = "grey20", high = "black")
    p <- p + theme(panel.background = element_blank(),
                   panel.grid.major.x = element_blank(),
                   panel.grid.minor.x = element_blank(),
                   panel.grid.minor.y = element_blank(),
                   axis.text.y = element_blank(),
                   axis.line = element_blank(),
                   axis.ticks.y = element_blank(),
                   axis.title.y = element_blank(),
                   strip.background = element_rect(colour = "grey", fill = "grey", size = 3))
    p <- p + labs(x = "Most used ngrams")
    return(p)
}
## p_freq <- plot_freq(processed, out, texts)
## p_freq

## Number of words in each answer
plot_ans_freq <- function(out, texts) {
  ans_raw <- data_frame(freq = sapply(tokenizers::tokenize_words(texts), length),
                        type = "Raw texts",
                        mean = mean(freq),
                        median = median(freq))
  ans_est <- data_frame(freq = as.numeric(sapply(out$documents, function(x) return(sum(x[2, ])))),
                        type = "Used in estimation",
                        mean = mean(freq),
                        median = median(freq))
  ans <- rbind(ans_raw, ans_est)
  p <- ggplot(ans, aes(x = freq, group = type, fill = type)) +
    geom_histogram(alpha = .5, binwidth = 1, position = "identity") +
    geom_vline(aes(xintercept = mean, colour = type), linetype = "dashed") +
    scale_fill_manual(values = c("grey", "orange")) +
    scale_colour_manual(values = c("grey", "orange")) +
    theme_bw() + theme(legend.position = c(.85, .7),
                       legend.background = element_blank(),
                       legend.title = element_blank()) +
    labs(y = "Number of\ndocuments", x = "Length of documents\n(Number of words)")
  return(p)
}

## plot_ans_freq(out, raw_text)

## p_ans_freq <- plot_ans_freq(out, texts)
## p_ans_freq

## Compare semantic coherence
plot_semcoh_all <- function(stm_models) {
    semcoh_all <- lapply(1:length(stm_models), function(mod) {
        semcoh_exc <- data.frame("run" = as.numeric(sapply(1:length(stm_models[[mod]]$exclusivity), rep, times = length(stm_models[[mod]]$exclusivity[[1]]))),
                                 "semcoh" = unlist(stm_models[[mod]]$semcoh), "exc" = unlist(stm_models[[mod]]$exclusivity))
        semcoh_exc$run <- as.factor(semcoh_exc$run)
        semcoh_exc$topics <- paste0(ncol(stm_models[[mod]]$runout[[1]]$theta), " topics")
        return(semcoh_exc)
    })
    semcoh_all <- do.call("rbind", semcoh_all)
    semcoh_all$topics <- factor(semcoh_all$topics, levels = unique(semcoh_all$topics))
    p <- ggplot(semcoh_all, aes(x = semcoh, y = exc, colour = run, label = run))
    p <- p + geom_text() + facet_wrap(~ topics)
    p <- p + theme_bw() + theme(legend.position = c(.75, .15))
    p <- p + labs(x = "Semantic Coherence", y = "Exclusivity", colour = "Run")
    return(p)
}

## Compare semantic coherence
plot_semcoh_all <- function(stm_models) {
  semcoh_all <- lapply(1:length(stm_models), function(mod) {
    semcoh_exc <- data.frame("run" = as.numeric(sapply(1:length(stm_models[[mod]]$exclusivity), rep, times = length(stm_models[[mod]]$exclusivity[[1]]))),
                             "semcoh" = unlist(stm_models[[mod]]$semcoh), "exc" = unlist(stm_models[[mod]]$exclusivity))
    semcoh_exc$run <- as.factor(semcoh_exc$run)
    semcoh_exc$topics <- paste0(ncol(stm_models[[mod]]$runout[[1]]$theta), " topics")
    return(semcoh_exc)
  })
  semcoh_all <- do.call("rbind", semcoh_all)
  semcoh_all$topics <- factor(semcoh_all$topics, levels = unique(semcoh_all$topics))
  p <- ggplot(semcoh_all, aes(x = semcoh, y = exc, colour = run, label = run)) +
    geom_point() + facet_wrap(~ topics) +
    theme_bw() + theme(text = element_text(size = 10),
                       legend.position = c(.75, .15)) +
    labs(x = "Semantic Coherence", y = "Exclusivity", colour = "Run")
  return(p)
}

## p_semcoh_all <- plot_semcoh_all(stm_models)
## p_semcoh_all

## Plot convergence
plot_conv_all <- function(stm_models) {

  conv <- lapply(1:length(stm_models), function(x) {
    out <- lapply(1:length(stm_models[[x]]$runout), function(y) {
      conv <- data.frame("topics" = paste0(ncol(stm_models[[x]]$runout[[y]]$theta), " topics"),
                         "run" = y,
                         "bound" = stm_models[[x]]$runout[[y]]$convergence$bound,
                         "iter" = 1:length(stm_models[[x]]$runout[[y]]$convergence$bound),
                         "conv" = as.factor(ifelse(stm_models[[x]]$runout[[y]]$convergence$converged, "Converged", "Did not converge")))
      return(conv)
    })
    out <- do.call("rbind", out)
    return(out)
  })
  conv <- do.call("rbind", conv)
  conv$conv <- as.character(conv$conv)
  conv$conv[conv$run == "1" & conv$topics == "3 topics"] <- "Did not converge"
  conv$conv <- as.factor(conv$conv)
  conv$run <- as.factor(conv$run)
  p <- ggplot(conv, aes(x = iter, y = bound, group = run, linetype = run, colour = conv)) +
    facet_wrap(~ topics) +
    geom_line() + theme_bw() +
    theme(text = element_text(size = 10),
          legend.position = c(.75, .15)) +
    labs(y = "Approximate Objective", x = "Convergence (iterations)",
         colour = "Convergence", linetype = "Run")
  return(p)
}

## p_conv_all <- plot_conv_all(stm_models)
## p_conv_all

## Plot all FREX
plot_frex_all <- function(stm_models, n = 15) {
  frex <- lapply(1:length(stm_models), function(x) {
    out <- lapply(1:length(stm_models[[x]]$runout), function(y) {
      frex <- data.frame("x" = 0,
                         "topic" = 1:ncol(stm_models[[x]]$runout[[y]]$theta),
                         "frex" = paste0(apply(labelTopics(stm_models[[x]]$runout[[y]], n = n)$frex, 1, paste, collapse = ", ")),
                         "model" = paste0(ncol(stm_models[[x]]$runout[[y]]$theta), " topics"),
                         "run" = paste0("Run ", y))
    })
    out <- do.call("rbind", out)
    return(out)
  })
  frex <- do.call("rbind", frex)
  frex$topic <- factor(as.factor(frex$topic), levels = rev(levels(as.factor(frex$topic))))
  p <- ggplot(frex, aes(x = x, y = topic, label = frex))
  p <- p + facet_grid(model ~ run, scale = "free_y")
  p <- p + geom_text(hjust = 0, size = 2.5) + theme_bw()
  p <- p + scale_x_continuous(limits = c(0, 1), expand = c(.01, .01))
  p <- p + labs(x = "Frex", y = "Topic")
  p <- p + theme(text = element_text(size = 10),
                 panel.background = element_blank(),
                 panel.grid.major.x = element_blank(),
                 panel.grid.minor.x = element_blank(),
                 panel.grid.minor.y = element_blank(),
                 axis.text.x = element_blank(),
                 axis.text.y = element_text(size = 9),
                 axis.line = element_blank(),
                 axis.ticks.x = element_blank(),
                 strip.background = element_rect(colour = "grey", fill = "grey", size = 3))
  return(p)
}
## p_frex_all <- plot_frex_all(stm_models)
## p_frex_all


## Plot all PROB
plot_prob_all <- function(stm_models, n = 15) {
    prob <- lapply(1:length(stm_models), function(x) {
        out <- lapply(1:length(stm_models[[x]]$runout), function(y) {
            prob <- data.frame("x" = 0,
                               "topic" = 1:ncol(stm_models[[x]]$runout[[y]]$theta),
                               "prob" = paste0(apply(labelTopics(stm_models[[x]]$runout[[y]], n = n)$prob, 1, paste, collapse = ", ")),
                               "model" = paste0(ncol(stm_models[[x]]$runout[[y]]$theta), " topics"),
                               "run" = paste0("Run ", y))
        })
        out <- do.call("rbind", out)
        return(out)
    })
    prob <- do.call("rbind", prob)
    prob$topic <- factor(as.factor(prob$topic), levels = rev(levels(as.factor(prob$topic))))
    p <- ggplot(prob, aes(x = x, y = topic, label = prob))
    p <- p + facet_grid(model ~ run, scale = "free_y")
    p <- p + geom_text(hjust = 0, size = 2.5) + theme_bw()
    p <- p + scale_x_continuous(limits = c(0, 1), expand = c(.01, .01))
    p <- p + labs(x = "Highest Probablity", y = "Topic")
    p <- p + theme(panel.background = element_blank(),
              panel.grid.major.x = element_blank(),
              panel.grid.minor.x = element_blank(),
              panel.grid.minor.y = element_blank(),
              axis.text.x = element_blank(),
              axis.text.y = element_text(size = 9),
              axis.line = element_blank(),
              axis.ticks.x = element_blank(),
              strip.background = element_rect(colour = "grey", fill = "grey", size = 3))
    return(p)
}
## p_prob_all <- plot_prob_all(stm_models)
## p_prob_all

## Plot all LIFT
plot_lift_all <- function(stm_models, n = 15) {
    lift <- lapply(1:length(stm_models), function(x) {
        out <- lapply(1:length(stm_models[[x]]$runout), function(y) {
            lift <- data.frame("x" = 0,
                               "topic" = 1:ncol(stm_models[[x]]$runout[[y]]$theta),
                               "lift" = paste0(apply(labelTopics(stm_models[[x]]$runout[[y]], n = n)$lift, 1, paste, collapse = ", ")),
                               "model" = paste0(ncol(stm_models[[x]]$runout[[y]]$theta), " topics"),
                               "run" = paste0("Run ", y))
        })
        out <- do.call("rbind", out)
        return(out)
    })
    lift <- do.call("rbind", lift)
    lift$topic <- factor(as.factor(lift$topic), levels = rev(levels(as.factor(lift$topic))))
    p <- ggplot(lift, aes(x = x, y = topic, label = lift))
    p <- p + facet_grid(model ~ run, scale = "free_y")
    p <- p + geom_text(hjust = 0, size = 2.5) + theme_bw()
    p <- p + scale_x_continuous(limits = c(0, 1), expand = c(.01, .01))
    p <- p + labs(x = "LIFT", y = "Topic")
    p <- p + theme(panel.background = element_blank(),
              panel.grid.major.x = element_blank(),
              panel.grid.minor.x = element_blank(),
              panel.grid.minor.y = element_blank(),
              axis.text.x = element_blank(),
              axis.text.y = element_text(size = 9),
              axis.line = element_blank(),
              axis.ticks.x = element_blank(),
              strip.background = element_rect(colour = "grey", fill = "grey", size = 3))
    return(p)
}
## p_lift_all <- plot_lift_all(stm_models)
## p_lift_all

## Plot all SCORE
plot_score_all <- function(stm_models, n = 15) {
    score <- lapply(1:length(stm_models), function(x) {
        out <- lapply(1:length(stm_models[[x]]$runout), function(y) {
            score <- data.frame("x" = 0,
                               "topic" = 1:ncol(stm_models[[x]]$runout[[y]]$theta),
                               "score" = paste0(apply(labelTopics(stm_models[[x]]$runout[[y]], n = n)$score, 1, paste, collapse = ", ")),
                               "model" = paste0(ncol(stm_models[[x]]$runout[[y]]$theta), " topics"),
                               "run" = paste0("Run ", y))
        })
        out <- do.call("rbind", out)
        return(out)
    })
    score <- do.call("rbind", score)
    score$topic <- factor(as.factor(score$topic), levels = rev(levels(as.factor(score$topic))))
    p <- ggplot(score, aes(x = x, y = topic, label = score))
    p <- p + facet_grid(model ~ run, scale = "free_y")
    p <- p + geom_text(hjust = 0, size = 2.5) + theme_bw()
    p <- p + scale_x_continuous(limits = c(0, 1), expand = c(.01, .01))
    p <- p + labs(x = "SCORE", y = "Topic")
    p <- p + theme(panel.background = element_blank(),
              panel.grid.major.x = element_blank(),
              panel.grid.minor.x = element_blank(),
              panel.grid.minor.y = element_blank(),
              axis.text.x = element_blank(),
              axis.text.y = element_text(size = 9),
              axis.line = element_blank(),
              axis.ticks.x = element_blank(),
              strip.background = element_rect(colour = "grey", fill = "grey", size = 3))
    return(p)
}
## p_score_all <- plot_score_all(stm_models)
## p_score_all

## Compare persons theta
plot_gini_pers_all <- function(stm_models) {
    gini <- lapply(1:length(stm_models), function(x) {
        out <- lapply(1:length(stm_models[[x]]$runout), function(y) {
            out_inner <- data.frame("gini" = apply(stm_models[[x]]$runout[[y]]$theta, 1, ineq, type = "Gini"),
                              "topics" = paste0(ncol(stm_models[[x]]$runout[[y]]$theta), " topics"),
                              "run" = y)
            out_inner$gini_mean <- mean(out_inner$gini)
            return(out_inner)
        })
        out <- do.call("rbind", out)
        return(out)
    })
    gini <- do.call("rbind", gini)
    gini$run <- as.factor(gini$run)
    p <- ggplot(gini, aes(x = run, y = gini))
    p <- p + facet_wrap(~ topics)
    p <- p + geom_boxplot(outlier.size = .5) + theme_bw() + coord_flip()
    p <- p + labs(x = "Run", y = "Gini (inequality) of topics within documents")
    return(p)
}
## p_gini_pers_all <- plot_gini_pers_all(stm_models)
## p_gini_pers_all

## Gini theta
plot_gini_theta_all <- function(stm_models, fit) {
    gini <- lapply(1:length(stm_models), function(x) {
        out <- lapply(1:length(stm_models[[x]]$runout), function(y) {
            out_inner <- data.frame("gini" = apply(stm_models[[x]]$runout[[y]]$theta, 2, ineq, type = "Gini"),
                                    "topic" = 1:ncol(stm_models[[x]]$runout[[y]]$theta),
                                    "topics" = paste0(ncol(stm_models[[x]]$runout[[y]]$theta), " topics"),
                                    "run" = paste0("Run ", y))
            return(out_inner)
        })
        out <- do.call("rbind", out)
        return(out)
    })
    gini <- do.call("rbind", gini)
    gini$topic <- factor(as.factor(gini$topic), levels = rev(levels(as.factor(gini$topic))))
    p <- ggplot(gini, aes(y = topic, x = gini, label = topic))
    p <- p + facet_grid(topics ~ run, scales = "free_y")
    p <- p + geom_text(size = 3) + theme_bw()
    p <- p + labs(x = "Gini (inequality) of documents within topics", y = "Topic")
    p <- p + scale_x_continuous(limits = c(0, .75))
    ##p <- p + geom_segment(aes(yend = topic, x = 0, xend = gini))
    p <- p + theme(axis.title = element_text(size = 9),
                   axis.text =  element_text(size = 9))
    return(p)
}
## p_gini_theta_all <- plot_gini_theta_all(stm_models, fit)
## p_gini_theta_all


##
## ASSOICATED WORDS
##

get_ass <- function(texts, se, cut, n_words) {
    tex <- tolower(gsub("\\s+", " ", gsub("^\\s+|\\s+$", "", texts)))
    message("", appendLF = FALSE)
    le <- lapply(tex, function(x) {
        if (any(grepl(se, x))) {
            te <- as.character(strsplit(gsub("[^A-Za-zøæåØÆÅ ]", "", x), " ")[[1]])
            mark <- grep(se, te)[1]
            up <- ifelse((mark + cut) <= length(te), mark + cut, length(te))
            low <- ifelse((mark - cut) >= 1, mark - cut, 1)
            out <- paste0(te[low:up][!grepl(se, te[low:up])], collapse = " ")
            return(out)
        } else {
            return(NA)
        }
    })
    te <- unlist(le)[!is.na(unlist(le))]
    te <- unlist(strsplit(te, " "))
    words_raw_table <- table(te)
    fe <- data.frame("words" = as.character(unlist(attr(words_raw_table, "dimnames")))[order(as.numeric(words_raw_table), decreasing = TRUE)][1:n_words],
                     "count" = as.numeric(words_raw_table)[order(as.numeric(words_raw_table), decreasing = TRUE)][1:n_words],
                     "search" = se,
                     "order" = rev(1:n_words))
    return(fe)
}

plot_ass_words <- function(texts, out, processed, cut = 3, n_words_se = 16, n_words_ass = 16) {
    le <- lapply(processed$vocab[order(out$wordcounts, decreasing = TRUE)][1:n_words_se], get_ass, texts = texts, n_words = n_words_ass, cut = cut)
    le <- do.call("rbind", le)
    p <- ggplot(le, aes(x = "word", y = order, label = words))
    p <- p + facet_wrap(~ search, scales = "free_y")
    p <- p + geom_text(aes(size = order, colour = count)) + geom_text(aes(x = "count", label = count))
    p <- p + guides(size = FALSE, alpha = FALSE, colour = FALSE)
    p <- p + scale_colour_gradient(low = "grey20", high = "black")
    p <- p + theme(panel.background = element_blank(),
                   panel.grid.major.x = element_blank(),
                   panel.grid.minor.x = element_blank(),
                   panel.grid.minor.y = element_blank(),
                   axis.text.y = element_blank(),
                   axis.line = element_blank(),
                   axis.ticks.y = element_blank(),
                   axis.title.y = element_blank(),
                   strip.background = element_rect(colour = "grey", fill = "grey", size = 3))
    p <- p + labs(x = paste0(strwrap(paste0("Most associated words for the ", n_words_se, " most frequent words in the corpus (used for estimation, i.e. after processing). A word is counted if it is within ", cut, " words in the raw texts."), width = 80), collapse = "\n"))
    return(p)
}
## p_ass_words <- plot_ass_words(texts, out, processed)
## p_ass_words

## Expected Doc length by topic

plot_top_length <- function(stm_models, out) {
    wcim <- function(x, weight, digits = 5) {
        mean <- weighted.mean(x, weight)
        error <- (qnorm(0.975) * sqrt(Hmisc::wtd.var(x, weight)))/sqrt(length(weight))
        low <- round((mean - error), digits = digits)
        hi <- round((mean + error), digits = digits)
        return(c(round(mean, digits = digits), low, hi))
    }
    te <- as.numeric(unlist(sapply(out$documents, function(y) sum(y[2, ]))))
    le <- lapply(1:length(stm_models), function(x) {
        out_outer <- lapply(1:length(stm_models[[x]]$runout), function(y) {
            le <- lapply(1:ncol(stm_models[[x]]$runout[[y]]$theta), function(z) {
                out_inner <- wcim(te, stm_models[[x]]$runout[[y]]$theta[, z])
                out_inner <- c(out_inner, y, z)
                return(out_inner)
            })
            le <- as.data.frame(do.call("rbind", le))
            names(le) <- c("mean", "low", "hi", "run", "topic")
            le$topics <- paste0(ncol(stm_models[[x]]$runout[[1]]$theta), " topics")
            return(le)
        })
        out_outer <- do.call("rbind", out_outer)
        out_outer$run <- paste0("Run ", out_outer$run)
        return(out_outer)
    })
    le <- do.call("rbind", le)
    le$topics <- factor(as.factor(le$topics), levels = levels(as.factor(le$topics))[order(as.numeric(gsub("\\D", "", levels(as.factor(le$topics)))))])
    le$topic <- factor(as.factor(le$topic), levels = max(as.numeric(levels(as.factor(le$topic)))):min(as.numeric(levels(as.factor(le$topic)))))
    p <- ggplot(le, aes(x = mean, xmin = low, xmax = hi, y = topic))
    p <- p + facet_grid(topics ~ run, scales = "free_y")
    p <- p + geom_errorbarh(height = 0) + geom_point() + theme_bw()
    p <- p + labs(y = "Topic", x = "Expected Document Length")
    return(p)
}
##p_top_length <- plot_top_length(stm_models, out)
##p_top_length

## Expected Topic prop all
plot_prop_all <- function(stm_models) {
    prop <- lapply(1:length(stm_models), function(x) {
        out_outer <- lapply(1:length(stm_models[[x]]$runout), function(y) {
            prop <- as.data.frame(stm_models[[x]]$runout[[y]]$theta)
            names(prop) <- 1:ncol(stm_models[[x]]$runout[[y]]$theta)
            prop <- tidyr::gather(prop, "topic", "prop")
            prop$topic <- as.numeric(prop$topic)
            prop$topics <- ncol(stm_models[[x]]$runout[[y]]$theta)
            prop$run <- y
            return(prop)
        })
        out_outer <- do.call("rbind", out_outer)
        out_outer$run <- paste0("Run ", out_outer$run)
        out_outer$topics <- paste0(out_outer$topics, " topics")
        return(out_outer)
    })
    prop <- do.call("rbind", prop)
    prop$topics <- factor(as.factor(prop$topics), levels = levels(as.factor(prop$topics))[order(as.numeric(gsub("\\D", "", levels(as.factor(prop$topics)))))])
    prop$topic <- factor(prop$topic, levels = rev(levels(factor(prop$topic))[order(as.numeric(levels(factor(prop$topic))))]))
    prop <- prop %>%
      group_by(topic, topics, run) %>%
        summarize(mean = mean(prop),
                  ci = ci(prop),
                  upper = mean + ci,
                  lower = mean - ci)
    p <- ggplot(prop, aes(y = topic, x = mean, xmax = upper, xmin = lower))
    p <- p + facet_grid(topics ~ run, scales = "free_y")
    p <- p + theme_bw() + scale_y_discrete()
    p <- p + geom_text(aes(label = topic)) + guides(colour = FALSE)
    p <- p + labs(y = "Topic", x = "Expected Topic Proportion\n")
    p <- p + theme(axis.title = element_text(size = 9),
                   axis.text =  element_text(size = 9))
    ## p <- p + geom_segment(aes(yend = topic, x = 0, xend = mean))
    return(p)
}
## p_prop_all <- plot_prop_all(stm_models)
## p_prop_all

## Get overview of topics/runs
get_view <- function(stm_models) {
    view <- lapply(stm_models, function(x) {
                       runs <- length(x$runout)
                       topics <- ncol(x$runout[[1]]$theta)
                       out <- data.frame("topics" = topics, "runs" = runs)
                       return(out)
                   })
    view <- do.call("rbind", view)
    return(view)
}

##
## Plot STM Models ------------------------------------------------------------------------------
##

## Prints a document that gives an overview of several STM runs.

## stm_models = list of stm models
## texts = texts used in models (as accepted by "findThoughts")
## processed = output from textProcessor(). Only needed if summary == TRUE.
## out = output from prepDecoments(). Only needed if summary == TRUE.
## summary = if to print summary. If FALSE, it will only print the one-page overview for each topic/run. If so, out and processed must also be provided
## path = path to output file. Must end with ".pdf"
## type = "normal" == with a fewer text, but more plots; "text" == just a really narrow frex at the top and then mostly text
## verbose = if TRUE it will display messages indicating how for it has come in the printing process
## title = title on the frontpage
## author = displayed after "author/contact" on the titlepage
## project = display after "project" on the titlepage
## version = displayed after "version" on the titlepage
## discription = text displayed underthe title on the titlepage

plot_stm_models <- function(stm_models, texts,
                            processed = NULL, out = NULL, summary = TRUE,
                            path = "explore_stm.pdf", type = "normal", verbose = TRUE,
                            title = "Overview of Structural Topic Model (STM) runs", author = "", project = "", version = "", description = "") {

    view <- get_view(stm_models)

    pdf(path, width = 8.27, height = 11.69)

    ## Titlepage
    p_fp_title <- plot_fp_title(title = title)
    p_fp_desc <- plot_fp_desc(description)
    p_fp_summary <- plot_fp_summary(author = author, project = project, version = version)
    grid.arrange(p_fp_summary, p_fp_title, p_fp_desc, ncol = 1, heights = c(1/6, 1/6, 4/6))

    if (summary) {
        if (verbose) message("On summary pages")

        ## Overal corpus summary
        if (verbose) message(".", appendLF = FALSE)
        p_freq <- plot_freq(processed, out, texts)
        p_ans_freq <- plot_ans_freq(out, texts)
        grid.arrange(p_ans_freq, p_freq, ncol = 1, heights = c(1/5, 4/5))

        ## Associated words
        if (verbose) message(".", appendLF = FALSE)
        p_ass_words <- plot_ass_words(texts, out, processed)
        grid.arrange(p_ass_words)

        ## Semcoh/exc and convergence
        if (verbose) message(".", appendLF = FALSE)
        p_semcoh_all <- plot_semcoh_all(stm_models)
        p_conv_all <- plot_conv_all(stm_models)
        grid.arrange(p_conv_all, p_semcoh_all, ncol = 1)

        ## Frex and gini
        if (verbose) message(".", appendLF = FALSE)
        p_gini_pers_all <- plot_gini_pers_all(stm_models)
        p_gini_theta_all <- plot_gini_theta_all(stm_models, fit)
        grid.arrange(p_gini_pers_all, p_gini_theta_all, heights = c(1/3, 2/3))

        ## Exp. doc. length by topic and exp. topic. prop.
        if (verbose) message(".", appendLF = FALSE)
        p_top_length <- plot_top_length(stm_models, out)
        grid.arrange(p_top_length)
        p_prop_all <- plot_prop_all(stm_models)
        grid.arrange(p_prop_all)

        # Frex grid
        if (verbose) message(".", appendLF = FALSE)
        p_frex_all <- plot_frex_all(stm_models)
        grid.arrange(p_frex_all)
        p_prob_all <- plot_prob_all(stm_models)
        grid.arrange(p_prob_all)
        p_lift_all <- plot_lift_all(stm_models)
        grid.arrange(p_lift_all)
        p_score_all <- plot_score_all(stm_models)
        grid.arrange(p_score_all)
    }

    ## Plot each run
    for (n_model in 1:length(stm_models)) {
        on_topic <- ncol(stm_models[[n_model]]$runout[[1]]$theta)
        if (verbose) message(paste0("\nOn model ", n_model, " (", on_topic, " topics)."))
        for (n_run in 1:length(stm_models[[n_model]]$runout)) {
            if (verbose) message(".", appendLF = FALSE)

            ## Make all plots
            fit <- stm_models[[n_model]]$runout[[n_run]]
            p_title  <- plot_title(fit, n_run)
            p_over <- plot_overview(view, fit, n_run)
            p_frex <- plot_frex(fit)
            p_semcoh <- plot_semcoh(stm_models, n_model, n_run)
            p_gini_theta <- plot_gini_theta(fit)
            p_prop <- plot_prop(fit)
            p_corr <- plot_corr(fit)
            p_texts <- plot_texts(fit, texts)

            ## Arrange plots
            if (type == "normal") {
                p_left <- arrangeGrob(arrangeGrob(p_title, p_over, nrow = 1), p_frex, ncol = 1, heights = c(1/3, 2/3))
                p_right <- arrangeGrob(p_corr, p_semcoh, p_prop, p_gini_theta, nrow = 2)
                p_top <- arrangeGrob(p_left, p_right, ncol = 2)
                p_out <- arrangeGrob(p_top, p_texts, ncol = 1, heights = c(3/12, 9/12))
            } else if (type == "text") {
                p_top <- arrangeGrob(p_title, p_over, (p_frex + theme(axis.title = element_blank())),
                                     ncol = 3, widths = c(1/4, 1/4, 1/2))
                p_out <- arrangeGrob(p_top, p_texts, ncol = 1, heights = c(1/12, 11/12))
            } else {
                stop("Type must be either 'normal' or 'text'")
            }

            ## Print them
            grid.arrange(p_out)
        }
    }
    dev.off()
}

## description <- "This is part of an experiment examening how opinion polls affect public opinion. The corpus consists of open-ended answers in the Norwegian Citizen Panel (wave 5) about Syrian refugees. It was asked as a follow-up to a closed question about whether Norway should accept more refugees than the (then) decided amount. About half of the respondents where shown an opinion poll-style graph showing how previous respondents answered during the closed-ended question."

## plot_stm_models(stm_models, processed = processed, out = out, texts = texts, author = "Mikael Johannesson (mikael.johannesson@uib.no)", version = "V1", project = "POLLS", description = description)
